In [31]:
import pandas as pd
from textblob import TextBlob
import nltk
from spacy.lang.en import STOP_WORDS
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
nlp=spacy.load("en_core_web_md")
In [32]:
english_stopwords=STOP_WORDS
In [33]:
#loading the dataset
data=pd.read_csv("assignment.csv")
In [34]:
#reviewing top 10 rows
data.head(10)
Out[34]:
unique_id raw_text review_text
0 0 Spiritually and mentally inspiring! A book tha... Menginspirasi secara spiritual dan mental! Buk...
1 1 This is one my must have books Ini adalah salah satu yang harus saya miliki buku
2 2 It is a masterpiece of spirituality Itu adalah mahakarya spiritualitas
3 3 I'll be the first to admit, its literary qual... Saya akan menjadi yang pertama mengakui, kuali...
4 4 It is rather simplistically written, but the ... Ini ditulis agak sederhana, tetapi pesan di ba...
5 5 It will take you to enlightenment Ini akan membawa Anda menuju pencerahan
6 7 This book provides a reflection that you can a... Buku ini memberikan refleksi yang dapat Anda t...
7 8 And, a way for you to try and assess whether y... Dan, cara bagi Anda untuk mencoba dan menilai ...
8 10 I first read THE PROPHET in college back in th... Saya pertama kali membaca THE NABI di pergurua...
9 11 The book had a revival as did anything metaph... Buku itu memiliki kebangkitan seperti halnya s...
In [35]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53757 entries, 0 to 53756
Data columns (total 3 columns):
unique_id      53757 non-null int64
raw_text       53757 non-null object
review_text    53682 non-null object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB
In [36]:
#Finding Uniques in each Columns of Training set

for i in data.columns:
    print('Total unique in',i,'=',data[i].nunique())
Total unique in unique_id = 53757
Total unique in raw_text = 48242
Total unique in review_text = 47782
In [37]:
#Finding Uniques in each Columns of Training set

for i in data.columns:
    print('Total null values in',i,'=',data[i].isnull().sum())
    
data.isnull().sum().plot(kind='bar')
Total null values in unique_id = 0
Total null values in raw_text = 0
Total null values in review_text = 75
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x21d96680f28>

Review_text has 75 null values

Converting all the text data into lowercase

In [38]:
# data['raw_text']=data['raw_text'].str.lower()
# data['review_text']=data['review_text'].str.lower()

As we have seen that there are 2 text columns are given i.e raw_text and review_text and both are in different languages. column with raw_text has English but the review_text has some other language which we don't know. So, we have to identify that language

In [39]:
#function to detect Language
def detect_lang(text):
    a=TextBlob(str(text))
    result=a.detect_language()
    return result

print("detected language of raw_text is:  ",detect_lang(data['raw_text']))
print("\n detected language of review_text is:  ",detect_lang(data['review_text']))
detected language of raw_text is:   en

 detected language of review_text is:   id

From above output we get to know that other language is Indonesian. Now we will see that the information present in both raw_text and review_text are same or different. We will convert first 5 sentences of Indonesian to English

In [40]:
#function to convert Language
def convert(text):
    a=TextBlob(str(text))
    result=a.translate(to='en')
    return result

#Reviewing top 5 sentences from English and comparing it with Indonesian language
print("top 5 sentences of raw_text is: \n ",data['raw_text'][:10])

print("\n top 5 sentences of review_text is:\n  ",convert(data['review_text'][:10]))
top 5 sentences of raw_text is: 
  0    Spiritually and mentally inspiring! A book tha...
1                       This is one my must have books
2                  It is a masterpiece of spirituality
3     I'll be the first to admit, its literary qual...
4     It is rather simplistically written, but the ...
5                    It will take you to enlightenment
6    This book provides a reflection that you can a...
7    And, a way for you to try and assess whether y...
8    I first read THE PROPHET in college back in th...
9     The book had a revival as did anything metaph...
Name: raw_text, dtype: object

 top 5 sentences of review_text is:
   0 Inspire spiritually and mentally! Buk ...
1 This is one of the must-have books
2 It is a masterpiece of spirituality
3 I will be the first to admit, quality ...
4 It's written rather simply, but the message below ...
5 This will bring you to enlightenment
6 This book gives reflections that you can t ...
7 And, a way for you to try and judge ...
8 I first read THE PROPHET in college ...
9 The book has a resurrection just like s ...
Name: review_text, dtype: object
In [41]:
data_english=pd.DataFrame(data['raw_text'])
data_indonesia=pd.DataFrame(data['review_text'])

Finding similarity between the two documents using Spacy

In [42]:
doc_eng=nlp(str(data_english))
doc_indo=nlp(str(data_indonesia))
In [43]:
a=doc_eng.similarity(doc_indo)
print('% of similar document: ',np.round(a*100,decimals=2))
% of similar document:  26.71
In [44]:
labels=['Similarity', 'Not Similar' ]
sizes = [0.27,0.74]
explode = (0.1,0)  # explode 1st slice

# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=plt.cm.Set2.colors,autopct='%2.2f%%', shadow=True, startangle=140)

plt.axis('equal')
plt.show()

From the graph we have seen that only 26.73% of Text is similar

First dealing with the English data

In [45]:
import string
def feature(data) :
## Finding the total number of words present in the each sentences
    data['word_count'] = data['raw_text'].apply(lambda x : len(x.split()))
#total number of Characters in a sentence
    data['char_count'] = data['raw_text'].apply(lambda x : len(x.replace(" ","")))
#calculating word density
    data['word_density'] = data['word_count'] / (data['char_count'] + 1)
#calculating punctuations in sentences
    data['punctuation']=data['raw_text'].apply(lambda x: len([x for x in x.split() if x in string.punctuation]))
# calculating total length of sentence
    data['total_length'] = data['raw_text'].apply(len)
#Total number of Numeric characters
    data['numerics']=data['raw_text'].apply(lambda x: len([x for x in x if x.isdigit()]))
#findng total number of stopwords in each sentence 
    data['stopwords']=data['raw_text'].apply(lambda x:len([x for x in x.lower().split() if x in STOP_WORDS]))
# finding total number of capital letters in sentence
    data['capitals']=data['raw_text'].apply(lambda x: len([x for x in x.split(" ") if x.isupper()]))
    data['caps_vs_length'] = data.apply(lambda row: float(row['capitals'])/float(row['total_length']),axis=1)
#calculating total number of exclamation_marks in sentence
    data['num_exclamation_marks'] =data['raw_text'].apply(lambda x: x.count('!'))
#calculating total number of question_marks in sentence
    data['num_question_marks'] = data['raw_text'].apply(lambda x: x.count('?'))
#calculating punctuations in sentences
    data['num_punctuation'] = data['raw_text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
#calculating total number of symbols in sentences
    data['num_symbols'] = data['raw_text'].apply(lambda x: sum(x.count(w) for w in '*&#$%'))
#calculating total number of unique words in sentences
    data['num_unique_words'] = data['raw_text'].apply(lambda x: len(set(w for w in x.split())))
#calculating words_vs_unique
    data['words_vs_unique'] = data['num_unique_words'] / data['word_count']
#calculating word_unique_percent
    data["word_unique_percent"] =  data["num_unique_words"]*100/data['word_count']
#calculating Sentiment
    data['polarity']=data['raw_text'].apply(lambda text: TextBlob(text).sentiment.polarity)
    return data
In [46]:
feature(data_english)
Out[46]:
raw_text word_count char_count word_density punctuation total_length numerics stopwords capitals caps_vs_length num_exclamation_marks num_question_marks num_punctuation num_symbols num_unique_words words_vs_unique word_unique_percent polarity
0 Spiritually and mentally inspiring! A book tha... 22 108 0.201835 0 129 0 12 1 0.007752 2 0 0 0 19 0.863636 86.363636 0.291667
1 This is one my must have books 7 24 0.280000 0 30 0 6 0 0.000000 0 0 0 0 7 1.000000 100.000000 0.000000
2 It is a masterpiece of spirituality 6 30 0.193548 0 36 0 4 0 0.000000 0 0 0 0 6 1.000000 100.000000 0.000000
3 I'll be the first to admit, its literary qual... 11 49 0.220000 0 60 0 6 0 0.000000 0 0 1 0 11 1.000000 100.000000 0.183333
4 It is rather simplistically written, but the ... 19 84 0.223529 0 103 0 14 0 0.000000 0 0 1 0 17 0.894737 89.473684 -0.050000
5 It will take you to enlightenment 6 28 0.206897 0 34 0 5 0 0.000000 0 0 0 0 6 1.000000 100.000000 0.000000
6 This book provides a reflection that you can a... 13 55 0.232143 0 67 0 8 0 0.000000 0 0 0 0 13 1.000000 100.000000 0.600000
7 And, a way for you to try and assess whether y... 28 106 0.261682 0 133 0 17 0 0.000000 0 0 1 0 25 0.892857 89.285714 0.261905
8 I first read THE PROPHET in college back in th... 11 42 0.255814 0 52 2 7 3 0.057692 0 0 0 0 10 0.909091 90.909091 0.125000
9 The book had a revival as did anything metaph... 13 61 0.209677 0 74 2 8 0 0.000000 0 0 0 0 13 1.000000 100.000000 0.000000
10 It had a profound effect on me and became a b... 16 55 0.285714 0 71 0 12 1 0.014085 0 0 0 0 14 0.875000 87.500000 0.083333
11 After graduation I joined the Peace Corps and... 30 144 0.206897 0 174 0 14 2 0.011494 0 0 1 0 27 0.900000 90.000000 0.400000
12 I read it before I married, just before and a... 22 97 0.224490 0 119 0 15 2 0.016807 0 0 1 0 17 0.772727 77.272727 0.175000
13 I am always amazed that there is a chapter th... 25 101 0.245098 0 126 0 16 1 0.007937 0 0 1 0 23 0.920000 92.000000 0.050000
14 Gibran offers timeless insights and love with ... 9 47 0.187500 0 55 0 3 0 0.000000 0 0 0 0 9 1.000000 100.000000 0.300000
15 I think that we as a nation should read AND l... 14 53 0.259259 0 67 0 9 2 0.029851 0 0 0 0 14 1.000000 100.000000 0.000000
16 It is definitely a time for thought and refle... 15 69 0.214286 0 84 0 9 0 0.000000 0 0 0 0 15 1.000000 100.000000 0.000000
17 A timeless classic 3 16 0.176471 0 18 0 1 1 0.055556 0 0 0 0 3 1.000000 100.000000 0.166667
18 It is a very demanding and assuming title, b... 19 85 0.220930 0 105 0 11 0 0.000000 0 0 1 0 18 0.947368 94.736842 0.600000
19 If he had the means to publish it a century ... 20 78 0.253165 0 99 0 13 0 0.000000 0 0 1 0 18 0.900000 90.000000 0.068182
20 From the mouth of an old man about to sail awa... 28 106 0.261682 0 133 0 15 0 0.000000 0 0 1 0 23 0.821429 82.142857 0.200000
21 It is a messege 4 12 0.307692 0 17 0 3 0 0.000000 0 0 0 0 4 1.000000 100.000000 0.000000
22 A guide book 3 10 0.272727 0 14 0 1 1 0.071429 0 0 0 0 3 1.000000 100.000000 0.000000
23 A Sufi sermon 3 11 0.250000 0 15 0 1 1 0.066667 0 0 0 0 3 1.000000 100.000000 -0.225000
24 Much is put in perspective without any hint o... 11 44 0.244444 0 55 0 8 0 0.000000 0 0 0 0 11 1.000000 100.000000 0.200000
25 There is much that hints at his birth place,... 28 130 0.213740 0 159 0 16 0 0.000000 0 0 1 0 26 0.928571 92.857143 0.258333
26 Probably becuase it was written in English ori... 29 140 0.205674 0 168 0 14 0 0.000000 0 0 3 0 24 0.827586 82.758621 0.369444
27 I loved the cover 4 14 0.266667 0 19 0 2 1 0.052632 0 0 0 0 4 1.000000 100.000000 0.700000
28 Reading this made my mind feel like a still po... 19 73 0.256757 0 91 0 9 0 0.000000 0 0 1 0 18 0.947368 94.736842 0.175000
29 It's direct and simple wisdom has a depth of... 22 90 0.241758 0 113 0 9 0 0.000000 0 0 1 0 21 0.954545 95.454545 0.033333
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
53727 The screen which looks outstanding inside unde... 27 149 0.180000 0 175 0 14 0 0.000000 0 0 2 0 25 0.925926 92.592593 0.087500
53728 So there you have my one gripe 7 24 0.280000 0 31 0 6 0 0.000000 0 0 0 0 7 1.000000 100.000000 0.000000
53729 I will come back and update this if anything c... 30 114 0.260870 0 143 0 22 2 0.013986 0 0 1 0 25 0.833333 83.333333 -0.021190
53730 UPDATE: 4/08/08My wife and I still have both o... 46 196 0.233503 0 241 6 27 2 0.008299 0 0 3 0 39 0.847826 84.782609 -0.067857
53731 The only slight problem we have had in over 2 ... 57 219 0.259091 0 275 4 37 0 0.000000 0 0 0 1 44 0.771930 77.192982 0.186905
53732 These phones have preformed flawlessly and we ... 13 58 0.220339 0 70 0 8 0 0.000000 0 0 0 0 13 1.000000 100.000000 1.000000
53733 I live in an area where Cingular service is re... 18 85 0.209302 0 102 0 9 2 0.019608 0 0 1 0 17 0.944444 94.444444 0.545455
53734 The phone itself is not one that I picked or... 12 56 0.210526 0 69 0 8 1 0.014493 0 0 0 0 12 1.000000 100.000000 0.000000
53735 My son tossed my old phone into the wash and... 25 105 0.235849 0 131 0 13 2 0.015267 0 0 0 0 22 0.880000 88.000000 0.044444
53736 It is longer and wider than my old Motorola ... 15 58 0.254237 0 74 0 9 0 0.000000 0 0 0 0 14 0.933333 93.333333 0.100000
53737 I was concerned about the thickness and dura... 8 43 0.181818 0 52 0 5 1 0.019231 0 0 0 0 8 1.000000 100.000000 0.000000
53738 This phone is also metal so I feel more conf... 19 78 0.240506 0 98 0 12 2 0.020408 0 0 0 0 17 0.894737 89.473684 0.277778
53739 My old accessories don't fit even though it is... 25 120 0.206612 0 144 1 14 0 0.000000 0 1 0 0 25 1.000000 100.000000 0.000000
53740 0 by Avanquest for my previous phone 7 30 0.225806 0 36 1 3 0 0.000000 0 0 0 0 7 1.000000 100.000000 -0.166667
53741 I plugged my new phone in and the program au... 13 56 0.228070 0 70 1 6 1 0.014286 0 0 0 0 13 1.000000 100.000000 0.136364
53742 0 and it prompted me to add a profile for my c... 13 46 0.276596 0 58 1 7 0 0.000000 0 0 0 0 13 1.000000 100.000000 0.000000
53743 With this program I can make my own ring ton... 34 130 0.259542 0 165 0 20 2 0.012121 0 0 0 0 28 0.823529 82.352941 0.600000
53744 I also put my favorite picture of my phone d... 24 92 0.258065 0 117 0 15 2 0.017094 0 0 0 0 19 0.791667 79.166667 0.500000
53745 My Bluetooth accessories work better with this... 12 67 0.176471 0 78 0 5 0 0.000000 0 0 0 0 12 1.000000 100.000000 0.166667
53746 On that phone the connection between the ear... 15 81 0.182927 0 97 0 8 0 0.000000 0 0 0 0 12 0.800000 80.000000 0.000000
53747 I could not hear anyone through the earpiece... 35 138 0.251799 1 174 0 24 3 0.017241 0 0 0 0 28 0.800000 80.000000 -0.800000
53748 The Razr and my Jabra work well together 8 33 0.235294 0 40 0 5 0 0.000000 0 0 0 0 8 1.000000 100.000000 0.000000
53749 It works great for voice dialing in my exper... 37 154 0.238710 1 192 0 23 3 0.015625 0 0 0 0 34 0.918919 91.891892 0.366667
53750 So I am very happy with this phone but in fair... 25 109 0.227273 0 133 0 17 3 0.022556 0 0 0 0 21 0.840000 84.000000 0.291667
53751 I had a Samsung X427 before this and it was awful 11 39 0.275000 0 49 3 8 2 0.040816 0 0 0 0 11 1.000000 100.000000 -1.000000
53752 Dropped calls, poor reception, wouldn't hold ... 8 46 0.170213 0 54 0 1 0 0.000000 0 0 2 0 8 1.000000 100.000000 -0.400000
53753 This phone changed my opinion completely 6 35 0.166667 0 41 0 2 0 0.000000 0 0 0 0 6 1.000000 100.000000 0.100000
53754 It does everything I need and then some, and ... 14 59 0.233333 0 73 0 10 1 0.013699 0 0 1 0 13 0.928571 92.857143 0.000000
53755 The only downside and the difference between ... 32 141 0.225352 1 173 2 18 0 0.000000 0 0 0 1 29 0.906250 90.625000 -0.050000
53756 But speakerphone is not really what I needed,... 13 65 0.196970 0 78 0 9 1 0.012821 0 0 1 0 13 1.000000 100.000000 -0.100000

53757 rows × 18 columns

In [47]:
data_english.describe()
Out[47]:
word_count char_count word_density punctuation total_length numerics stopwords capitals caps_vs_length num_exclamation_marks num_question_marks num_punctuation num_symbols num_unique_words words_vs_unique word_unique_percent polarity
count 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53757.000000 53682.000000 53682.000000 53757.000000
mean 16.187603 70.323456 0.231637 0.046896 86.505088 0.361851 8.761036 0.642112 0.010120 0.087375 0.022360 0.731440 0.097141 14.837975 0.946749 94.674860 0.160662
std 11.221867 50.669765 0.045952 0.248695 61.714149 1.289668 6.447710 1.330312 0.037344 0.487598 0.187166 1.164304 1.139342 9.213290 0.068301 6.830102 0.300859
min 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.333333 33.333333 -1.000000
25% 9.000000 36.000000 0.205882 0.000000 45.000000 0.000000 4.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 0.904762 90.476190 0.000000
50% 14.000000 60.000000 0.229508 0.000000 74.000000 0.000000 8.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 13.000000 1.000000 100.000000 0.075000
75% 21.000000 92.000000 0.253521 0.000000 113.000000 0.000000 12.000000 1.000000 0.012987 0.000000 0.000000 1.000000 0.000000 19.000000 1.000000 100.000000 0.350000
max 220.000000 1119.000000 0.666667 6.000000 1338.000000 68.000000 102.000000 66.000000 1.000000 22.000000 9.000000 23.000000 154.000000 153.000000 1.000000 100.000000 1.000000
In [48]:
import holoviews as hv
import hvplot.dask
hv.extension('bokeh')
In [49]:
hist=hv.Histogram(np.histogram(data_english['word_count'],bins=100, normed=False),label='Histogram of Word Count')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='red',width=600, height=380, logy=False, tools=['hover'],xlabel=' Word Count')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.
  """Entry point for launching an IPython kernel.
Out[49]:

from above graph we can see that most of the sentences has words between 5 to 25

In [50]:
hist=hv.Histogram(np.histogram(data_english['char_count'],bins=100, normed=False),label='Histogram of Charater Count')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='green',width=600,height=380, logy=False, tools=['hover'],xlabel=' Charater Count')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.
  """Entry point for launching an IPython kernel.
Out[50]:

from above graph we can see that most of the sentences has characters between 17 to 110

In [51]:
hist=hv.Histogram(np.histogram(data_english['word_density'],bins=100, normed=False),label='Histogram of Word Density')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='blue',width=600,height=380, logy=False, tools=['hover'],xlabel=' Word Density')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.
  """Entry point for launching an IPython kernel.
Out[51]:
In [52]:
hist=hv.Histogram(np.histogram(data_english['total_length'],bins=100, normed=False),label='Histogram of Total Length')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='maroon',width=600,height=380, logy=False, tools=['hover'],xlabel='Total Length')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.
  """Entry point for launching an IPython kernel.
Out[52]:

from above graph we can see that most of the sentences has lenth between 21 to 130

In [53]:
a=data_english.numerics.value_counts()
bar=hv.Bars(a,label='Numeric Value Count')
bar.opts(width=600,height=380, logy=False, tools=['hover'],xlabel='Numeric Value', ylabel='Sentences Count')
Out[53]:

Most of the sentences have 0 numeric value, 2453 sentences has 1 numeric value and so on

In [54]:
hist=hv.Histogram(np.histogram(data_english['capitals'],bins=100, normed=False),label='Histogram of Capital Charater Count')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='indigo',width=600,height=380, logy=False, tools=['hover'],xlabel=' Capital Charater Count')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.
  """Entry point for launching an IPython kernel.
Out[54]:
In [55]:
a=data_english.num_exclamation_marks.value_counts()
bar=hv.Bars(a,label='Number Exclamation Marks')
bar.opts(color='orange',width=600,height=380, logy=False, tools=['hover'],xlabel='Exclamation Mark count', ylabel='Sentences Count')
Out[55]:
In [56]:
hist=hv.Histogram(np.histogram(data_english['num_unique_words'],bins=100, normed=False),label='Histogram of Unique Words')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='violet',width=600,height=380, logy=False, tools=['hover'],xlabel='Unique Words Count')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.
  """Entry point for launching an IPython kernel.
Out[56]:

Most of the unique words are in 5 to 20

In [63]:
hist=hv.Histogram(np.histogram(data_english['polarity'],bins=100, normed=False),label='Histogram of Sentiment Polarity')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='indigo',width=600,height=380, logy=False, tools=['hover'],xlabel='Sentiment Polarity')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.
  """Entry point for launching an IPython kernel.
Out[63]:

Vast majority of the sentiment polarity scores are greater than zero, means most of them are pretty positive.

5 random reviews with the highest positive sentiment polarity

In [64]:
print('5 random reviews with the highest positive sentiment polarity: \n')
cl = data_english.loc[data_english.polarity == 1, ['raw_text']].sample(5).values
for c in cl:
    print(c[0])
5 random reviews with the highest positive sentiment polarity: 

The Nook Tablet, in both the 16gb version and 8 gb version, are the best color e-readers of 2012
  Anyway, everything about Santa Claus Is Comin' To Town is perfect
 I purchased this for my kid and he is very HAPPY
  She was very happy with this product
I wore these in the rain , and they did excellent

5 random reviews with the most neutral sentiment(zero) polarity

In [65]:
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
cl = data_english.loc[data_english.polarity == 0, ['raw_text']].sample(5).values
for c in cl:
    print(c[0])
5 random reviews with the most neutral sentiment(zero) polarity: 

This is the only soap I use (with occasional excursions into pine tar soap, for specific uses
This processor does not work
 I ordered 11
  Of course the grammar I learned in my four years studying Latin made French grammar easier
I was eager to read this book after all of the praise it has received

random reviews with the most negative polarity

In [66]:
print('5 reviews with the most negative polarity: \n')
cl = data_english.loc[data_english.polarity == -1, ['raw_text']].sample(5).values
for c in cl:
    print(c[0])
5 reviews with the most negative polarity: 

 It's terrible, unbearable
 I was stunned at how horrible it looked on
 I felt his pain as his parents died, I understood his awe at Rosie, I felt his disgust at August's disgusting behavior
The battery life is insane
 A figure of peace and justice, betrayed by someone close to him and taken by the forces of evil

Before reviewing Text Features we need to first clean the text data, remove punctuations, stopwords, lemmatize the text data

In [67]:
#Processing Data
#Removing Punctuation and Stopwords
import string
all_punctuations = string.punctuation + '‘’\n,""'':”][]!'

def remove_puctuation(text):
    doc = nlp(str(text))
    no_punctuation = [char for char in text if char not in all_punctuations]
    no_punctuation = ''.join(no_punctuation)
    return no_punctuation

def remove_stop_words(text):
    doc = nlp(str(text))
#     result = [token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_  for token in doc.sents]
    result=[word for word in text.split() if word not in english_stopwords]
#     mytokens = [ word for word in text.split() if word not in STOP_WORDS]
    return " ".join(result)

#lemmatizing the words i.e converting words to their base words e.g- literals->literal, best->good
def lemmatize(text):
    doc = nlp(str(text))
    return " ".join([token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc])
In [68]:
all_punctuations
Out[68]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘’\n,"":”][]!'
In [69]:
# Removing StopWords
data_english['processed_raw_text']=data_english['raw_text'].apply(remove_puctuation)
In [70]:
import plotly.graph_objects as go
import chart_studio.plotly as py
fig = go.Figure(go.Histogram(x=data_english.polarity,marker_color='#EB89B5'))
fig.show()
In [71]:
#Applying Lemmetizing
data_english['processed_raw_text']=data_english['processed_raw_text'].apply(lemmatize)
In [72]:
#Removing Stopwords and creating new column
data_english['processed_wo_stopwords_raw_text']=data_english['processed_raw_text'].apply(remove_stop_words)
In [73]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
text = str(data_english['processed_raw_text'])
wordcloud = WordCloud(max_font_size=100, max_words=1000, background_color="white",width=1200,height=650,colormap="icefire").generate(text)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

From above WordCloud we can see that PHONE, BOOK, ONE, READ, UPDATE have high frequency i.e. they are occuring more often

we need to extract N-Gram features. N-grams are used to describe the number of words used as observation points, e.g., unigram means singly-worded, bigram means 2-worded phrase, and trigram means 3-worded phrase. In order to do this, we use scikit-learn’s CountVectorizer function.First, it would be interesting to compare unigrams before and after removing stop words.

The distribution of top unigrams before removing stop words

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
vector=CountVectorizer()
In [75]:
def top_words(text):
    vector=CountVectorizer()
    bag_of_words=vector.fit_transform(data_english['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words(data_english['processed_raw_text'])
df= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df=df.head(20)
fig = go.Figure([go.Bar(x=df['ReviewText'], y=df['count'],text=df['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 words in review before removing stop words',)
fig.update_traces(marker_color='green', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()

The distribution of top unigrams after removing stop words

In [76]:
def top_words_wo_stop(text):
    vector=CountVectorizer(stop_words='english')
    bag_of_words=vector.fit_transform(data_english['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_english['processed_raw_text'])
df1= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df1=df1.head(20)
fig = go.Figure([go.Bar(x=df1['ReviewText'], y=df1['count'],text=df1['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 words in review after removing stop words',)
fig.update_traces(marker_color='blue', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()

The distribution of top bigrams before removing stop words

In [77]:
def top_words_wo_stop(text):
    vector=CountVectorizer(ngram_range=(2,2))
    bag_of_words=vector.fit_transform(data_english['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_english['processed_raw_text'])
df2= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df2=df2.head(20)
fig = go.Figure([go.Bar(x=df2['ReviewText'], y=df2['count'],text=df2['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 bigrams in review before removing stop words',)
fig.update_traces(marker_color='black', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()

The distribution of top bigrams after removing stop words

In [78]:
def top_words_wo_stop(text):
    vector=CountVectorizer(stop_words=english_stopwords,ngram_range=(2,2))
    bag_of_words=vector.fit_transform(data_english['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_english['processed_raw_text'])
df3= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df3=df3.head(20)
fig = go.Figure([go.Bar(x=df3['ReviewText'], y=df3['count'],text=df3['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 bigrams in review after removing stop words',)
fig.update_traces(marker_color='maroon', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
C:\Users\dell\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py:300: UserWarning:

Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ll', 've'] not in stop_words.

The distribution of Top trigrams before removing stop words

In [79]:
def top_words_wo_stop(text):
    vector=CountVectorizer(ngram_range=(3,3))
    bag_of_words=vector.fit_transform(data_english['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_english['processed_raw_text'])

df4= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df4=df4.head(20)
fig = go.Figure([go.Bar(x=df4['ReviewText'], y=df4['count'],text=df4['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 trigrams in review after removing stop words',)
fig.update_traces(marker_color='steelblue', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()

The distribution of Top trigrams after removing stop words

In [80]:
def top_words_wo_stop(text):
    vector=CountVectorizer(ngram_range=(3,3),stop_words=english_stopwords)
    bag_of_words=vector.fit_transform(data_english['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_english['processed_raw_text'])
df5= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df5=df5.head(20)
fig = go.Figure([go.Bar(x=df5['ReviewText'], y=df5['count'],text=df5['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 trigrams in review after removing stop words')
fig.update_traces(marker_color='rebeccapurple', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()

Converting polarity values into POSITIVE , NEUTRAL and NEGATIVE

In [81]:
#into POSITIVE (polarity >= 0.2) & (polarity <= 1) 
#For NEUTRAL (polarity >= 0) & (polarity < 0.2)
#For NEGATIVE (polarity >= -1) & (polarity < 0)
data_english.loc[(data_english.polarity >= 0.2) & (data_english.polarity <= 1),'sentiment'] = 'positive'
data_english.loc[(data_english.polarity >= 0) & (data_english.polarity < 0.2),'sentiment'] = 'neutral'
data_english.loc[(data_english.polarity >= -1) & (data_english.polarity < 0), 'sentiment'] = 'negative'
In [82]:
y=data_english.groupby('sentiment')['processed_raw_text'].count()
y=pd.DataFrame(y)
y.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=y['sentiment'], y=y['processed_raw_text'],text=y['processed_raw_text'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Total number of Sentiment in each category')
fig.update_traces(marker_color='dodgerblue', marker_line_color='black',marker_line_width=1.5, opacity=0.8)
fig.update_xaxes(tickangle=330,title='Sentiment')
fig.update_yaxes(title='Count')
fig.show()

Creating Radar Plot

In [83]:
Data = [
    go.Scatterpolar(
        r = [data_english.loc[data_english["sentiment"] =="positive"]['total_length'].median(),
                data_english.loc[data_english["sentiment"] =="positive"]['word_count'].median(), 
                data_english.loc[data_english["sentiment"] =="positive"]['num_unique_words'].median(),
                data_english.loc[data_english["sentiment"] =="positive"]['caps_vs_length'].median(), 
                 data_english.loc[data_english["sentiment"] =="positive"]['char_count'].median()],
                theta = ['Total_Lenght','WordCount',
                         'Count_unique_words', "Capitals_VS_Length", "Charcount"],
                fill = 'toself', 
                line = dict( color = 'brown'), 
                name=  "Positive Statistics", subplot = "polar"),
        
        go.Scatterpolar(
            r = [data_english.loc[data_english["sentiment"] =="neutral"]['total_length'].median(),
                data_english.loc[data_english["sentiment"] =="neutral"]['word_count'].median(), 
                data_english.loc[data_english["sentiment"] =="neutral"]['num_unique_words'].median(),
                data_english.loc[data_english["sentiment"] =="neutral"]['caps_vs_length'].median(), 
                 data_english.loc[data_english["sentiment"] =="neutral"]['char_count'].median()],
                theta = ['Total_Lenght','WordCount',
                         'Count_unique_words', "Capitals_VS_Length", "Charcount"],
                fill = 'toself', 
                line = dict( color = 'magenta'), 
                name=  "Neutral Statistics", subplot = "polar2"),
        
        go.Scatterpolar(
            r = [data_english.loc[data_english["sentiment"] =="negative"]['total_length'].median(),
                data_english.loc[data_english["sentiment"] =="negative"]['word_count'].median(), 
                data_english.loc[data_english["sentiment"] =="negative"]['num_unique_words'].median(),
                data_english.loc[data_english["sentiment"] =="negative"]['caps_vs_length'].median(), 
                 data_english.loc[data_english["sentiment"] =="negative"]['char_count'].median()],
                theta = ['Total_Lenght','WordCount',
                         'Count_unique_words', "Capitals_VS_Length", "Charcount"],
                fill = 'toself', 
                line = dict( color = 'orange'), 
                name=  "Negative Statistics", subplot = "polar3"),
       ]
layout = go.Layout(
    polar3 = dict(
      domain = dict(
        x = [0, 0.6],
        y = [0.55, 1]
      ),
      radialaxis = dict(visible = True,)),
polar2 = dict(
      domain = dict(
        x = [0, 0.35],
        y = [0, 0.45]
      ),
      radialaxis = dict(visible = True,)),
        
    polar = dict(
      domain = dict(
        x = [0.33, 0.999999],
        y = [0, 0.45]
      ),
      radialaxis = dict(visible = True,)),
    
    title =  "Comapring Median")
fig = go.Figure(data=Data, layout=layout)
fig.show()

Finding NER using SPACY

In [87]:
from spacy import displacy
doc_eng=nlp(str(data_english['processed_raw_text']))
In [88]:
#Removing whitespaces from the data
import re
sample=[]
for sent in doc_eng.sents:
    sent = re.sub("\s+"," ",sent.text) # clean up the whitespace
    print(sent,"\n")
    sample.append(sent)
    
doc2=nlp(str(sample))
<>:5: DeprecationWarning:

invalid escape sequence \s

<>:5: DeprecationWarning:

invalid escape sequence \s

<>:5: DeprecationWarning:

invalid escape sequence \s

<ipython-input-88-e644805875c2>:5: DeprecationWarning:

invalid escape sequence \s

0 spiritually and mentally inspire a book that a... 1  

this be one my must have 

book 2  

it be a masterpiece of spirituality 3  

ill be the first to admit its literary qualit...  

4  

it be rather simplistically write but the mes... 5  

it will take you to enlightenment  

6  

this book provide a reflection that you can ap... 7 and a way for you to try and assess whether yo... 8  

i first read the prophet in college back in th... 9  

the book have a revival as do anything metaph...  

10  

it have a profound effect on me and become a ... 11 after graduation i join the peace corps and d...  

12  

i read it before i marry just before and agai...  

13  

i be always amazed that there be a chapter th... 14  

gibran offer timeless insight and love with ea... 15  

i think that we as a nation should read and l... 16  

it be definitely a time for thought and refle...  

17  

a timeless classic 18  

it be a very demanding and assume title but g...  

19  

if he have the mean to publish it a century o 

... 20  

from the mouth of an old man about to sail awa... 21  

it be a messege 22  

a guide book 23  

a sufi sermon 24  

much be put in perspective without any hint o...  

25  

there be much that hint at his birth place 

le...  

26  

probably becuase it be write in english origin...  

27  

i love the cover 28  

read this make my mind feel like a still pool ...  

29  

its direct and simple wisdom have a depth of ... ... 53727  

the screen which look outstanding inside under... 53728  

so there you have my one gripe 53729  

i will come back and update this if anything c... 53730  

update 40808my wife and i still have both of t...  

53731  

the only slight problem we have have in over 2...  

53732 these phone have preform flawlessly and we use... 53733  

i live in an area where cingular service be re...  

53734 the phone itself be not one that i pick or re...  

53735 my son toss my old phone into the wash and th... 53736  

it be long and wide than my old motorola phon... 53737 i be concerned about the thickness and durabi... 53738 this phone be also metal 

so i feel more confi...  

53739 my old accessory do not fit even though it be ... 53740 0 by avanquest for my previous phone 53741  

i plug my new phone in and the program automa... 53742 0 and it prompt me to add a profile for my cur... 53743 with this program i can make my own ring tone...  

53744  

i also put my favorite picture of my phone du...  

53745  

my bluetooth accessory work better with this p... 53746  

on that phone the connection between the earp... 53747  

i could not hear anyone through the earpiece ...  

53748  

the razr and my jabra work 

well together  

53749  

it work great for voice dialing in my experie...  

53750  

so i be very happy with this phone but in fair...  

53751  

i have a samsung x427 before this 

and it be awful  

53752  

drop call poor reception would not hold a charge 53753  

this phone change my opinion completely 53754  

it do everything i need 

and then some and the... 53755  

the only downside and the difference between ... 53756 but speakerphone be not really what i need it...  

Name: processed_raw_text, 

Length: 53757, dtype: object 

In [89]:
displacy.render(doc2,style='ent',jupyter=True)
['0 spiritually and mentally inspire a book that a... 1 ', 'this be one my must have', 'book 2 CARDINAL ', 'it be a masterpiece of spirituality 3 CARDINAL ', 'ill be the first ORDINAL to admit its literary qualit... ', '4 ', 'it be rather simplistically write but the mes... 5 CARDINAL ', 'it will take you to enlightenment ', ' 6 CARDINAL ', 'this book provide a reflection that you can ap... 7 and a way for you to try and assess whether yo... 8 ', 'i first ORDINAL read the prophet in college back in th... 9 CARDINAL ', 'the book have a revival as do anything metaph... ', ' 10 CARDINAL ', 'it have a profound effect on me and become a ... 11 CARDINAL after graduation i join the peace corps and d... ', ' 12 CARDINAL ', 'i read it before i marry just before and agai... ', ' 13 CARDINAL ', 'i be always amazed that there be a chapter th... 14 CARDINAL ', 'gibran offer timeless insight and love with ea... 15 CARDINAL ', 'i think that we as a nation should read and l... 16 CARDINAL ', 'it be definitely a time for thought and refle... ', ' 17 CARDINAL ', 'a timeless classic 18 CARDINAL ', 'it be a very demanding and assume title but g... ', ' 19 CARDINAL ', 'if he have the mean to publish it a century o', '... 20 CARDINAL ', 'from the mouth of an old man about to sail awa... 21 CARDINAL ', 'it be a messege 22 CARDINAL ', 'a guide book 23 CARDINAL ', 'a sufi sermon 24 CARDINAL ', 'much be put in perspective without any hint o... ', ' 25 CARDINAL ', 'there be much that hint at his birth place', 'le... ', ' 26 CARDINAL ', 'probably becuase it be write in english origin... ', ' 27 CARDINAL ', 'i love the cover 28 CARDINAL ', 'read this make my mind feel like a still pool ... ', ' 29 CARDINAL ', 'its direct and simple wisdom have a depth of ... ... 53727 ', 'the screen which look outstanding inside under... 53728 CARDINAL ', 'so there you have my one CARDINAL gripe 53729 DATE ', 'i will come back and update this if anything c... 53730 DATE ', 'update 40808my wife and i still have both of t... ', ' 53731 CARDINAL ', 'the only slight problem we have have in over 2 CARDINAL ... ', ' 53732 CARDINAL these phone have preform flawlessly and we use... 53733 DATE ', 'i live in an area where cingular service be re... ', ' 53734 CARDINAL the phone itself be not one that i pick or re... ', ' 53735 CARDINAL my son toss my old phone into the wash and th... 53736 ', 'it be long and wide than my old motorola phon... 53737 i be concerned about the thickness and durabi... 53738 CARDINAL this phone be also metal', 'so i feel more confi... ', ' 53739 CARDINAL my old accessory do not fit even though it be ... 53740 0 by avanquest for my previous phone 53741 DATE ', 'i plug my new phone in and the program automa... 53742 0 DATE and it prompt me to add a profile for my cur... 53743 with this program i can make my own ring tone... ', ' 53744 CARDINAL ', 'i also put my favorite picture of my phone du... ', '53745 ', 'my bluetooth accessory work better with this p... 53746 ', 'on that phone the connection between the earp... 53747 ', 'i could not hear anyone through the earpiece ... ', '53748 ', 'the razr and my jabra work', 'well together ', '53749 ', 'it work great for voice dialing in my experie... ', ' 53750 CARDINAL ', 'so i be very happy with this phone but in fair... ', ' 53751 DATE ', 'i have a samsung x427 before this', 'and it be awful ', '53752 ', 'drop call poor reception would not hold a charge 53753 ', 'this phone change my opinion completely 53754 DATE ', 'it do everything i need', 'and then some and the... 53755 DATE ', 'the only downside and the difference between ... 53756 DATE but speakerphone be not really what i need it... ', 'Name: processed_raw_text,', 'Length: 53757 CARDINAL , dtype: object']
In [91]:
text=[]
pos=[]
pos_tag=[]
sentences=[]
for sent in sample:
    parsed_sentence=nlp(str(sample))
    for token in parsed_sentence:
        text.append(token.text)
        pos.append(token.pos_)
        pos_tag.append(token.tag_)
        sentences.append(token)
    
POS=pd.DataFrame({'sentence':sentences,'text':text,'pos':pos,'pos_tag':pos_tag})
# print('The total number of entities detected were:{}'.format(len(Entities)))
POS.head(7)
Out[91]:
sentence text pos pos_tag
0 [ [ PUNCT -LRB-
1 ' ' PUNCT ``
2 0 0 NUM CD
3 spiritually spiritually ADV RB
4 and and CCONJ CC
5 mentally mentally ADV RB
6 inspire inspire VERB VB

Top 20 VERB in the text'

In [92]:
a=POS.groupby(['pos','text'])['pos'].count()
a=POS[POS.pos == 'VERB']
a= a.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
a=pd.DataFrame(a)
a.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=a['text'], y=a['pos'],text=a['pos'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 VERB in the text',)
fig.update_traces(marker_color='maroon', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [93]:
b=POS.groupby(['pos','text'])['pos'].count()
b=POS[POS.pos == 'NOUN']
b= b.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
b=pd.DataFrame(b)
b.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=b['text'], y=b['pos'],text=b['pos'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 NOUN in the text',)
fig.update_traces(marker_color='chocolate', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [94]:
c=POS.groupby(['pos','text'])['pos'].count()
c=POS[POS.pos == 'ADJ']
c= c.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
c=pd.DataFrame(c)
c.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=c['text'], y=c['pos'],text=c['pos'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 ADJ in the text',)
fig.update_traces(marker_color='darkturquoise', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [95]:
d=POS.groupby(['pos','text'])['pos'].count()
d=POS[POS.pos == 'ADV']
d= d.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
d=pd.DataFrame(d)
d.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=d['text'], y=d['pos'],text=d['pos'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 ADV in the text',)
fig.update_traces(marker_color='olive', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()

Performing Topic Modeling on the dataset

In [96]:
import re 
import gensim 
from gensim import corpora 

# libraries for visualization 
import pyLDAvis 
import pyLDAvis.gensim 
In [97]:
#Creating Tokens
def lemmatization(texts, tags=['NOUN', 'ADJ']): 
    output = []
    for sent in texts:
        doc = nlp(" ".join(sent))       
        output.append([token.lemma_ for token in doc if token.pos_ in tags])
        return output
In [99]:
#Tokenizing sentences
tokenized_reviews = pd.Series(data_english['processed_wo_stopwords_raw_text']).apply(lambda x: x.split())
print(tokenized_reviews[0])
['spiritually', 'mentally', 'inspire', 'book', 'allow', 'question', 'moral', 'help', 'discover']
In [100]:
#creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(tokenized_reviews)
In [ ]:
%%time
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]
# Creating the object for LDA model using gensim library 
LDA = gensim.models.ldamodel.LdaModel 
# Build LDA model 
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary,                                     
                num_topics=5, random_state=100, chunksize=1000,                                     
                passes=30)
In [184]:
lda_model.print_topics()
Out[184]:
[(0,
  '0.024*"year" + 0.015*"battery" + 0.014*"case" + 0.013*"love" + 0.013*"old" + 0.012*"3" + 0.011*"light" + 0.011*"unit" + 0.010*"key" + 0.010*"life"'),
 (1,
  '0.014*"bit" + 0.013*"time" + 0.012*"little" + 0.012*"good" + 0.012*"wear" + 0.010*"hour" + 0.009*"excellent" + 0.008*"plastic" + 0.008*"long" + 0.008*"model"'),
 (2,
  '0.027*"work" + 0.026*"good" + 0.019*"great" + 0.019*"quality" + 0.016*"price" + 0.012*"sound" + 0.011*"ice" + 0.010*"size" + 0.010*"cream" + 0.010*"charger"'),
 (3,
  '0.041*"use" + 0.018*"headset" + 0.013*"tool" + 0.013*"charge" + 0.012*"device" + 0.012*"ear" + 0.011*"bluetooth" + 0.011*"recommend" + 0.010*"5" + 0.009*"problem"'),
 (4,
  '0.051*"phone" + 0.027*"use" + 0.024*"buy" + 0.016*"work" + 0.014*"like" + 0.013*"small" + 0.012*"need" + 0.012*"fit" + 0.012*"try" + 0.011*"great"')]
In [107]:
lda_model.save('model1.gensim')
C:\Users\dell\Anaconda3\lib\site-packages\smart_open\smart_open_lib.py:398: UserWarning:

This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function

In [185]:
pyLDAvis.enable_notebook() 
lda_display = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)
C:\Users\dell\Anaconda3\lib\site-packages\pyLDAvis\_prepare.py:257: FutureWarning:

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.


Out[185]:

performing Sentiment Analysis on the processed_wo_stopwords_raw_text

In [108]:
features=data_english['processed_wo_stopwords_raw_text']
labels=data_english['sentiment']
In [109]:
from sklearn.model_selection import train_test_split
# Split X and y into training and test set in 70:20 ratio

x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=1)
In [110]:
#Creating a bag-of-words dictionary of words from the data
bow_dictionary = CountVectorizer().fit(x_train)

#Total number of words in the bow_dictionary
len(bow_dictionary.vocabulary_)

#Using the bow_dictionary to create count vectors for the cleaned data.
bow = bow_dictionary.transform(x_train)

#Printing the shape of the bag of words model
print(bow.shape)
(43005, 22370)
In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(x_train)
# summarize
# print(vectorizer.vocabulary_)
print(vectorizer.idf_)
# encode document
train_features = vectorizer.transform(x_train)
print(train_features.shape)
# summarize encoded vector
[ 7.95552285 10.97594774 10.97594774 ... 10.57048263 10.97594774
 10.97594774]
(43005, 22370)
In [118]:
test_features = vectorizer.transform(x_test)
In [119]:
from sklearn.naive_bayes import MultinomialNB

#Fitting the training data to the classifier
classifier = MultinomialNB().fit(train_features, y_train)
In [120]:
#Predicting test data
predicted=classifier.predict(test_features)
In [121]:
# Training accuracy
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print("The training accuracy is: ")
print(accuracy_score(y_train, classifier.predict(train_features)))
The training accuracy is: 
0.7413324032089292

From above graph we can see that accuracy of Naive Bayes is 74.5%

In [122]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(y_test,predicted))
The test accuracy is: 
0.6635044642857143
In [123]:
# Classification report
print("Classification report")
print(classification_report(y_test,predicted))
Classification report
              precision    recall  f1-score   support

    negative       0.90      0.09      0.17      1692
     neutral       0.61      0.82      0.70      4842
    positive       0.74      0.71      0.73      4218

    accuracy                           0.66     10752
   macro avg       0.75      0.54      0.53     10752
weighted avg       0.71      0.66      0.63     10752

In [126]:
conf_matrix = confusion_matrix(y_test, predicted)
conf_matrix
Out[126]:
array([[ 158, 1363,  171],
       [  12, 3972,  858],
       [   6, 1208, 3004]], dtype=int64)

Performing the different Statistics operation on Review_text i.e. Indonesian language Dataset

In [192]:
#Importing indonesian Stopwords
from spacy.lang.id import STOP_WORDS
indonesian_stopwords=STOP_WORDS

#Viewing top 15 indonesian stopwords
mylist=list(indonesian_stopwords)
mylist[:15]
Out[192]:
['seringnya',
 'kita',
 'sesekali',
 'seusai',
 'tempat',
 'berlangsung',
 'memisalkan',
 'waktunya',
 'wah',
 'soal',
 'selama',
 'mempunyai',
 'tuturnya',
 'atas',
 'berapapun']
In [128]:
#we have seperated the dataset above
data_indonesia.head(10)
Out[128]:
review_text
0 Menginspirasi secara spiritual dan mental! Buk...
1 Ini adalah salah satu yang harus saya miliki buku
2 Itu adalah mahakarya spiritualitas
3 Saya akan menjadi yang pertama mengakui, kuali...
4 Ini ditulis agak sederhana, tetapi pesan di ba...
5 Ini akan membawa Anda menuju pencerahan
6 Buku ini memberikan refleksi yang dapat Anda t...
7 Dan, cara bagi Anda untuk mencoba dan menilai ...
8 Saya pertama kali membaca THE NABI di pergurua...
9 Buku itu memiliki kebangkitan seperti halnya s...
In [129]:
#function to convert Language
def convert(text):
    a=TextBlob(str(text))
    result=a.translate(to='en')
    return result

#Reviewing top 5 sentences from English and comparing it with Indonesian language
print("top 5 sentences of raw_text is: \n ",data['raw_text'][:10])

print("\n top 5 sentences of review_text is:\n  ",convert(data['review_text'][:10]))
top 5 sentences of raw_text is: 
  0    Spiritually and mentally inspiring! A book tha...
1                       This is one my must have books
2                  It is a masterpiece of spirituality
3     I'll be the first to admit, its literary qual...
4     It is rather simplistically written, but the ...
5                    It will take you to enlightenment
6    This book provides a reflection that you can a...
7    And, a way for you to try and assess whether y...
8    I first read THE PROPHET in college back in th...
9     The book had a revival as did anything metaph...
Name: raw_text, dtype: object

 top 5 sentences of review_text is:
   0 Inspire spiritually and mentally! Buk ...
1 This is one of the must-have books
2 It is a masterpiece of spirituality
3 I will be the first to admit, quality ...
4 It's written rather simply, but the message below ...
5 This will bring you to enlightenment
6 This book gives reflections that you can t ...
7 And, a way for you to try and judge ...
8 I first read THE PROPHET in college ...
9 The book has a resurrection just like s ...
Name: review_text, dtype: object
In [130]:
#finding and dropping rows having NULL values

print('\n before removing null values')
print(data_indonesia.isnull().sum())
data_indonesia.dropna(inplace=True)

print('\n after removing null values')
print(data_indonesia.isnull().sum())
 before removing null values
review_text    75
dtype: int64

 after removing null values
review_text    0
dtype: int64
In [131]:
#Converting Indonesian text to English
english_text=convert(data_indonesia['review_text'])
In [205]:
#correcting text
english_text.correct()
Out[205]:
TextBlob("0 Inspire spiritually and mentally! But ...
1 His is one of the must-have books
2 It is a masterpiece of spirituality
3 I will be the first to admit, quality ...
4 It's written rather simply, but the message below ...
5 His will bring you to enlightenment
6 His book gives reflections that you can t ...
7 And, a way for you to try and judge ...
8 I first read THE PROPHET in college ...
9 The book has a resurrection just like s ...
10 It has a profound effect on me and becomes ...
11 After granulating I joined the Herd Corps ...
12 I read it before I got married, right ...
13 I am always amazed that there is a chapter that reaches ...
14 Fibrin offers insight and lasting love with ...
15 I think we as a nation must read D ...
16 It is definitely time to think and reflect b ...
17 Lifeless classic
18 His is a very demanding and original title ...
19 Of he has the means to publish it ...
20 From the mouth of an old man who will sail ...
21 That is chaos
22 A user guide
23 Puff Sermons
24 Such is put into perspective without ...
25 There are many who are hinting at the birthplace ...
26 Maybe because it's written in English ...
27 I like the cover
28 Leading this makes my mind feel like ...
29 His direct and simple wisdom has ...
                               ...
53727 The screen that looks amazing inside under ...
53728 To there you have my complaint
53729 I will go back and update this if there is ...
53730 UPDATE: 4/08/08 By wife and I still have ...
53731 The only small problem we have and ...
53732 These phone have been set up before ...
53733 I live in an area where Singular services ...
53734 The telephone itself is not one that I ...
53735 By son threw my old telephone to the place ...
53736 It's longer and wider than a telephone ...
53737 I am worried about thickness and disability
53738 His telephone is also made of metal so that ...
53739 By old accessories aren't right even though they are ...
53740 0 by Avanquest for my previous telephone
53741 I am connecting my new phone and program ...
53742 0 and it prompted me to add pro ...
53743 With this program I can make tones der ...
53744 I also included my favorite picture about ...
53745 By Bluetooth accessory works better with ...
53746 In that telephone, the connection between the apiece ...
53747 I can't hear people's voices through ...
53748 By War and Mavra are working fine
53749 His works well for calls through a voice ...
53750 To I'm very happy with this phone but ...
53751 I have a Samsung X427 before this and that me ...
53752 Walls were disconnected, poor reception, no ...
53753 His telephone changed my opinion completely
53754 It does everything I need and some ...
53755 The only downside and the difference between 4 & ...
53756 But the speakerphone isn't really what ...
Same: review_text, Length: 53682, type: object")
In [139]:
import string
def feature(data) :
## Finding the total number of words present in the each sentences
    data['word_count'] = data['review_text'].apply(lambda x : len(x.split()))
#total number of Characters in a sentence
    data['char_count'] = data['review_text'].apply(lambda x : len(x.replace(" ","")))
#calculating word density
    data['word_density'] = data['word_count'] / (data['char_count'] + 1)
#calculating punctuations in sentences
    data['punctuation']=data['review_text'].apply(lambda x: len([x for x in x.split() if x in string.punctuation]))
# calculating total length of sentence
    data['total_length'] = data['review_text'].apply(len)
#Total number of Numeric characters
    data['numerics']=data['review_text'].apply(lambda x: len([x for x in x if x.isdigit()]))
#findng total number of stopwords in each sentence 
    data['stopwords']=data['review_text'].apply(lambda x:len([x for x in x.lower().split() if x in STOP_WORDS]))
# finding total number of capital letters in sentence
    data['capitals']=data['review_text'].apply(lambda x: len([x for x in x.split(" ") if x.isupper()]))
    data['caps_vs_length'] = data.apply(lambda row: float(row['capitals'])/float(row['total_length']),axis=1)
#calculating total number of exclamation_marks in sentence
    data['num_exclamation_marks'] =data['review_text'].apply(lambda x: x.count('!'))
#calculating total number of question_marks in sentence
    data['num_question_marks'] = data['review_text'].apply(lambda x: x.count('?'))
#calculating punctuations in sentences
    data['num_punctuation'] = data['review_text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
#calculating total number of symbols in sentences
    data['num_symbols'] = data['review_text'].apply(lambda x: sum(x.count(w) for w in '*&#$%'))
#calculating total number of unique words in sentences
    data['num_unique_words'] = data['review_text'].apply(lambda x: len(set(w for w in x.split())))
#calculating words_vs_unique
    data['words_vs_unique'] = data['num_unique_words'] / data['word_count']
#calculating word_unique_percent
    data["word_unique_percent"] =  data["num_unique_words"]*100/data['word_count']
#calculating Sentiment
    data['polarity']=data['review_text'].apply(lambda text: TextBlob(text).sentiment.polarity)
    return data

analyzing Indonesian Language

In [140]:
feature(data_indonesia)
Out[140]:
review_text word_count char_count word_density punctuation total_length numerics stopwords capitals caps_vs_length num_exclamation_marks num_question_marks num_punctuation num_symbols num_unique_words words_vs_unique word_unique_percent polarity
0 Menginspirasi secara spiritual dan mental! Buk... 21 137 0.152174 0 157 0 13 0 0.000000 2 0 0 0 17 0.809524 80.952381 -0.041667
1 Ini adalah salah satu yang harus saya miliki buku 9 41 0.214286 0 49 0 6 0 0.000000 0 0 0 0 9 1.000000 100.000000 0.000000
2 Itu adalah mahakarya spiritualitas 4 31 0.125000 0 34 0 2 0 0.000000 0 0 0 0 4 1.000000 100.000000 0.000000
3 Saya akan menjadi yang pertama mengakui, kuali... 10 63 0.156250 0 72 0 7 0 0.000000 0 0 1 0 10 1.000000 100.000000 0.000000
4 Ini ditulis agak sederhana, tetapi pesan di ba... 14 82 0.168675 0 95 0 8 0 0.000000 0 0 1 0 14 1.000000 100.000000 0.000000
5 Ini akan membawa Anda menuju pencerahan 6 34 0.171429 0 39 0 4 0 0.000000 0 0 0 0 6 1.000000 100.000000 0.000000
6 Buku ini memberikan refleksi yang dapat Anda t... 12 71 0.166667 0 82 0 8 0 0.000000 0 0 0 0 11 0.916667 91.666667 0.000000
7 Dan, cara bagi Anda untuk mencoba dan menilai ... 23 123 0.185484 0 145 0 16 0 0.000000 0 0 1 0 20 0.869565 86.956522 0.000000
8 Saya pertama kali membaca THE NABI di pergurua... 12 57 0.206897 0 68 2 6 2 0.029412 0 0 0 0 11 0.916667 91.666667 0.000000
9 Buku itu memiliki kebangkitan seperti halnya s... 14 85 0.162791 0 98 2 7 0 0.000000 0 0 0 0 13 0.928571 92.857143 0.000000
10 Itu memiliki efek mendalam pada saya dan menja... 13 63 0.203125 0 75 0 8 0 0.000000 0 0 0 0 12 0.923077 92.307692 0.000000
11 Setelah lulus saya bergabung dengan Korps Perd... 31 172 0.179191 0 202 0 17 0 0.000000 0 0 1 0 28 0.903226 90.322581 0.000000
12 Saya membacanya sebelum saya menikah, tepat se... 21 121 0.172131 0 141 0 15 0 0.000000 0 0 1 0 16 0.761905 76.190476 0.000000
13 Saya selalu kagum bahwa ada bab yang menjangka... 19 102 0.184466 0 120 0 11 0 0.000000 0 0 1 0 18 0.947368 94.736842 0.000000
14 Gibran menawarkan wawasan dan cinta abadi deng... 9 52 0.169811 0 60 0 4 0 0.000000 0 0 0 0 9 1.000000 100.000000 0.000000
15 Saya pikir kita sebagai bangsa harus membaca D... 12 67 0.176471 0 78 0 7 1 0.012821 0 0 0 0 12 1.000000 100.000000 0.000000
16 Sudah pasti saat untuk berpikir dan merenung b... 13 70 0.183099 0 82 0 9 0 0.000000 0 0 0 0 13 1.000000 100.000000 0.000000
17 Klasik abadi 2 11 0.166667 0 12 0 0 0 0.000000 0 0 0 0 2 1.000000 100.000000 0.000000
18 Ini adalah judul yang sangat menuntut dan asum... 19 106 0.177570 0 124 0 12 0 0.000000 0 0 1 0 17 0.894737 89.473684 0.000000
19 Jika dia memiliki sarana untuk menerbitkannya ... 16 95 0.166667 0 110 0 9 0 0.000000 0 0 1 0 15 0.937500 93.750000 0.000000
20 Dari mulut seorang lelaki tua yang akan berlay... 21 116 0.179487 0 136 0 10 0 0.000000 0 0 1 0 20 0.952381 95.238095 0.000000
21 Itu adalah kekacauan 3 18 0.157895 0 20 0 2 0 0.000000 0 0 0 0 3 1.000000 100.000000 0.000000
22 Sebuah buku petunjuk 3 18 0.157895 0 20 0 1 0 0.000000 0 0 0 0 3 1.000000 100.000000 0.000000
23 Khotbah Sufi 2 11 0.166667 0 12 0 0 0 0.000000 0 0 0 0 2 1.000000 100.000000 0.000000
24 Banyak yang dimasukkan ke dalam perspektif tan... 10 57 0.172414 0 66 0 7 0 0.000000 0 0 0 0 10 1.000000 100.000000 0.000000
25 Ada banyak yang mengisyaratkan di tempat kelah... 25 133 0.186567 0 157 0 14 0 0.000000 0 0 2 0 20 0.800000 80.000000 0.000000
26 Mungkin karena itu ditulis dalam bahasa Inggri... 24 150 0.158940 0 173 0 10 0 0.000000 0 0 3 0 23 0.958333 95.833333 0.000000
27 Saya menyukai sampulnya 3 21 0.136364 0 23 0 1 0 0.000000 0 0 0 0 3 1.000000 100.000000 0.000000
28 Membaca ini membuat pikiran saya merasa sepert... 15 81 0.182927 0 95 0 7 0 0.000000 0 0 1 0 15 1.000000 100.000000 0.000000
29 Ini kebijaksanaan langsung dan sederhana memil... 19 133 0.141791 0 151 0 8 0 0.000000 0 0 1 0 18 0.947368 94.736842 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
53727 Layar yang terlihat luar biasa di dalam di baw... 32 171 0.186047 0 202 0 23 0 0.000000 0 0 2 0 24 0.750000 75.000000 0.000000
53728 Jadi di sana Anda punya satu keluhan saya 8 34 0.228571 0 41 0 7 0 0.000000 0 0 0 0 8 1.000000 100.000000 0.000000
53729 Saya akan kembali dan memperbarui ini jika ada... 26 135 0.191176 0 160 0 21 1 0.006250 0 0 1 0 21 0.807692 80.769231 0.000000
53730 UPDATE: 4/08 / 08Saya dan istri saya masih mem... 44 244 0.179592 1 287 6 21 1 0.003484 0 0 3 0 36 0.818182 81.818182 0.000000
53731 Satu-satunya masalah kecil yang kami miliki da... 47 253 0.185039 1 299 4 31 0 0.000000 0 0 2 1 41 0.872340 87.234043 0.000000
53732 Telepon-telepon ini telah dibentuk sebelumnya ... 11 81 0.134146 0 91 0 8 0 0.000000 0 0 0 0 11 1.000000 100.000000 0.000000
53733 Saya tinggal di daerah di mana layanan Cingula... 18 98 0.181818 0 115 0 10 0 0.000000 0 0 1 0 16 0.888889 88.888889 0.000000
53734 Ponsel itu sendiri bukan salah satu yang saya ... 13 63 0.203125 0 75 0 8 0 0.000000 0 0 0 0 13 1.000000 100.000000 0.000000
53735 Anak saya melemparkan ponsel lama saya ke temp... 22 122 0.178862 0 143 0 13 0 0.000000 0 0 0 0 19 0.863636 86.363636 0.000000
53736 Ini lebih panjang dan lebih lebar dari ponsel ... 15 74 0.200000 0 88 0 11 0 0.000000 0 0 0 0 13 0.866667 86.666667 0.000000
53737 Saya khawatir tentang ketebalan dan daya tahan 7 40 0.170732 0 46 0 3 0 0.000000 0 0 0 0 7 1.000000 100.000000 0.000000
53738 Ponsel ini juga terbuat dari logam sehingga sa... 21 128 0.162791 0 148 0 13 0 0.000000 0 0 0 0 20 0.952381 95.238095 0.000000
53739 Asesoris lama saya tidak pas walaupun merupaka... 24 138 0.172662 0 161 1 14 0 0.000000 0 1 1 0 23 0.958333 95.833333 0.000000
53740 0 oleh Avanquest untuk ponsel saya sebelumnya 7 39 0.175000 0 45 1 4 0 0.000000 0 0 0 0 7 1.000000 100.000000 0.000000
53741 Saya menghubungkan telepon baru saya dan progr... 12 69 0.171429 0 80 1 6 0 0.000000 0 0 0 0 12 1.000000 100.000000 0.000000
53742 0 dan itu mendorong saya untuk menambahkan pro... 13 65 0.196970 0 77 1 9 0 0.000000 0 0 0 0 11 0.846154 84.615385 0.000000
53743 Dengan program ini saya dapat membuat nada der... 32 184 0.172973 0 215 0 19 0 0.000000 0 0 0 0 26 0.812500 81.250000 0.000000
53744 Saya juga memasukkan gambar favorit saya tenta... 20 122 0.162602 0 141 0 10 0 0.000000 0 0 0 0 16 0.800000 80.000000 0.000000
53745 Aksesori Bluetooth saya berfungsi lebih baik d... 13 84 0.152941 0 96 0 8 0 0.000000 0 0 0 0 12 0.923077 92.307692 0.000000
53746 Di telepon itu, koneksi antara lubang suara da... 13 77 0.166667 0 89 0 5 0 0.000000 0 0 1 0 12 0.923077 92.307692 0.000000
53747 Saya tidak dapat mendengar suara orang melalui... 29 181 0.159341 1 209 0 14 0 0.000000 0 0 1 0 24 0.827586 82.758621 0.000000
53748 Razr dan Jabra saya bekerja dengan baik 7 33 0.205882 0 39 0 5 0 0.000000 0 0 0 0 7 1.000000 100.000000 0.000000
53749 Ini berfungsi baik untuk panggilan melalui sua... 35 196 0.177665 1 230 0 23 0 0.000000 0 0 0 0 31 0.885714 88.571429 0.000000
53750 Jadi saya sangat senang dengan ponsel ini teta... 23 130 0.175573 0 152 0 14 0 0.000000 0 0 0 0 19 0.826087 82.608696 0.000000
53751 Saya punya Samsung X427 sebelum ini dan itu me... 9 46 0.191489 0 54 3 6 1 0.018519 0 0 0 0 9 1.000000 100.000000 0.000000
53752 Panggilan terputus, penerimaan yang buruk, tid... 9 61 0.145161 0 69 0 3 0 0.000000 0 0 2 0 9 1.000000 100.000000 0.000000
53753 Ponsel ini mengubah pendapat saya sepenuhnya 6 39 0.150000 0 44 0 2 0 0.000000 0 0 0 0 6 1.000000 100.000000 0.000000
53754 Itu melakukan semua yang saya butuhkan dan beb... 14 71 0.194444 1 84 0 8 0 0.000000 0 0 1 0 13 0.928571 92.857143 0.000000
53755 Satu-satunya downside dan perbedaan antara 4 &... 29 160 0.180124 3 188 2 12 0 0.000000 0 0 0 1 25 0.862069 86.206897 -0.100000
53756 Tetapi speakerphone tidak benar-benar apa yang... 12 75 0.157895 0 86 0 7 0 0.000000 0 0 1 0 12 1.000000 100.000000 0.000000

53682 rows × 18 columns

In [141]:
hist=hv.Histogram(np.histogram(data_indonesia['word_count'],bins=100, normed=False),label='Histogram of Word Count')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='red',width=600, height=380, logy=False, tools=['hover'],xlabel=' Word Count')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.

Out[141]:
In [142]:
hist=hv.Histogram(np.histogram(data_indonesia['char_count'],bins=100, normed=False),label='Histogram of Charater Count')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='green',width=600,height=380, logy=False, tools=['hover'],xlabel=' Charater Count')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.

Out[142]:
In [143]:
hist=hv.Histogram(np.histogram(data_indonesia['word_density'],bins=100, normed=False),label='Histogram of Word Density')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='blue',width=600,height=380, logy=False, tools=['hover'],xlabel=' Word Density')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.

Out[143]:
In [144]:
hist=hv.Histogram(np.histogram(data_indonesia['total_length'],bins=100, normed=False),label='Histogram of Total Length')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='maroon',width=600,height=380, logy=False, tools=['hover'],xlabel='Total Length')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.

Out[144]:
In [145]:
a=data_indonesia.numerics.value_counts()
bar=hv.Bars(a,label='Numeric Value Count')
bar.opts(width=600,height=380, logy=False, tools=['hover'],xlabel='Numeric Value', ylabel='Sentences Count')
Out[145]:
In [146]:
hist=hv.Histogram(np.histogram(data_indonesia['capitals'],bins=100, normed=False),label='Histogram of Capital Charater Count')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='indigo',width=600,height=380, logy=False, tools=['hover'],xlabel=' Capital Charater Count')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.

Out[146]:
In [147]:
a=data_indonesia.num_exclamation_marks.value_counts()
bar=hv.Bars(a,label='Number Exclamation Marks')
bar.opts(color='orange',width=600,height=380, logy=False, tools=['hover'],xlabel='Exclamation Mark count', ylabel='Sentences Count')
Out[147]:
In [148]:
hist=hv.Histogram(np.histogram(data_indonesia['num_unique_words'],bins=100, normed=False),label='Histogram of Unique Words')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='violet',width=600,height=380, logy=False, tools=['hover'],xlabel='Unique Words Count')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.

Out[148]:
In [149]:
hist=hv.Histogram(np.histogram(data_indonesia['polarity'],bins=100, normed=False),label='Histogram of Sentiment Polarity')
# hist.opts(width=600, logy=True, tools=['hover'])
hist.opts(color='indigo',width=600,height=380, logy=False, tools=['hover'],xlabel='Sentiment Polarity')
C:\Users\dell\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:

Passing normed=False is deprecated, and has no effect. Consider passing the density argument instead.

Out[149]:
In [150]:
#5 random reviews with the highest positive sentiment polarity

print('5 random reviews with the highest positive sentiment polarity: \n')
cl = data_indonesia.loc[data_indonesia.polarity == 1, ['review_text']].sample(5).values
for c in cl:
    print(c[0])
5 random reviews with the highest positive sentiment polarity: 

Ketika saya masih muda, frasa seperti "The Greatest Story Ever Told" tampak seperti slogan iklan dan saya mengabaikannya
Ada banyak buku yang tersedia bagi mereka yang tidak terbiasa memasak tekanan seperti Pressure Perfect: Selera Dua Jam dalam Dua Puluh Menit Menggunakan Pressure Cooker Anda dan banyak lainnya
The Nook Color melakukan pekerjaan yang WONDERFUL sebagai e-reader tidak hanya buku, tetapi juga majalah dan surat kabar, dan memiliki beberapa lonceng dan peluit menyenangkan lainnya.
Kalau tidak, tidak dapat mengalahkannya untuk harga seperti di Target / Best Buy dll
ini adalah kualitas terbaik dengan harga AWESOME
In [151]:
#5 random reviews with the highest neutral sentiment polarity

print('5 random reviews with the highest positive sentiment polarity: \n')
cl = data_indonesia.loc[data_indonesia.polarity == 0, ['review_text']].sample(5).values
for c in cl:
    print(c[0])
5 random reviews with the highest positive sentiment polarity: 

Seluruh alasan film ini ada adalah untuk menunjukkan apa yang Kristus alami
Saya suka halaman ini menggeser fitur geser
Pengecorannya sangat bagus dan musik serta temanya lawas tetapi sangat relevan
Saya punya beberapa alat
Direkomendasikan oleh kakak saya dan sekarang saya juga bisa
In [152]:
#5 random reviews with the highest negative sentiment polarity

print('5 random reviews with the highest positive sentiment polarity: \n')
cl = data_indonesia.loc[data_indonesia.polarity == -1, ['review_text']].sample(3).values
for c in cl:
    print(c[0])
5 random reviews with the highest positive sentiment polarity: 

TETAPI menggunakan Rosetta, saya tidak hanya bisa melewati kelas saya tanpa terdengar dan merasa seperti orang idiot, tapi saya benar-benar mendapat nilai A !! Saya tidak pernah mengira itu suatu kemungkinan
Jika terlalu brutal, terlalu kejam, terlalu gamblang, maka para penonton itu harus nyata! Tidak ada yang gratis, berlebihan, atau berlebihan
TETAPI menggunakan Rosetta, saya tidak hanya bisa melewati kelas saya tanpa terdengar dan merasa seperti orang idiot, tapi saya benar-benar mendapat nilai A !! Saya tidak pernah mengira itu suatu kemungkinan
In [153]:
# Removing StopWords
data_indonesia['processed_raw_text']=data_indonesia['review_text'].apply(remove_puctuation)
In [154]:
#Applying Lemmetizing
data_indonesia['processed_raw_text']=data_indonesia['processed_raw_text'].apply(lemmatize)
In [155]:
#Removing Stopwords and creating new column

def remove_stop_words(text):
    doc = nlp(str(text))
#     result = [token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_  for token in doc.sents]
    result=[word for word in text.split() if word not in indonesian_stopwords]
#     mytokens = [ word for word in text.split() if word not in STOP_WORDS]
    return " ".join(result)

data_indonesia['processed_wo_stopwords_raw_text']=data_indonesia['processed_raw_text'].apply(remove_stop_words)
In [156]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
text = str(data_indonesia['processed_raw_text'])
wordcloud = WordCloud(max_font_size=100, max_words=1000, background_color="white",width=1200,height=650,colormap="icefire").generate(text)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

The distribution of top unigrams before removing stop words

In [157]:
def top_words(text):
    vector=CountVectorizer()
    bag_of_words=vector.fit_transform(data_indonesia['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words(data_indonesia['processed_raw_text'])
df6= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df6=df6.head(20)
fig = go.Figure([go.Bar(x=df6['ReviewText'], y=df6['count'],text=df6['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 words in review before removing stop words',)
fig.update_traces(marker_color='green', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [158]:
def top_words_wo_stop(text):
    vector=CountVectorizer(stop_words=indonesian_stopwords)
    bag_of_words=vector.fit_transform(data_indonesia['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_indonesia['processed_raw_text'])
df7= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df7=df7.head(20)
fig = go.Figure([go.Bar(x=df7['ReviewText'], y=df7['count'],text=df7['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 words in review after removing stop words',)
fig.update_traces(marker_color='blue', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
C:\Users\dell\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py:300: UserWarning:

Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['baiknya', 'berkali', 'kali', 'kurangnya', 'mata', 'olah', 'sekurang', 'setidak', 'tama', 'tidaknya'] not in stop_words.

In [159]:
def top_words_wo_stop(text):
    vector=CountVectorizer(ngram_range=(2,2))
    bag_of_words=vector.fit_transform(data_indonesia['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_indonesia['processed_raw_text'])
df8= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df8=df8.head(20)
fig = go.Figure([go.Bar(x=df8['ReviewText'], y=df8['count'],text=df8['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 bigrams in review before removing stop words',)
fig.update_traces(marker_color='gray', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [160]:
def top_words_wo_stop(text):
    vector=CountVectorizer(stop_words=indonesian_stopwords,ngram_range=(2,2))
    bag_of_words=vector.fit_transform(data_indonesia['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_indonesia['processed_raw_text'])
df9= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df9=df9.head(20)
fig = go.Figure([go.Bar(x=df9['ReviewText'], y=df9['count'],text=df9['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 bigrams in review after removing stop words',)
fig.update_traces(marker_color='maroon', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [161]:
def top_words_wo_stop(text):
    vector=CountVectorizer(ngram_range=(3,3))
    bag_of_words=vector.fit_transform(data_indonesia['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_indonesia['processed_raw_text'])
df10= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df10=df10.head(20)
fig = go.Figure([go.Bar(x=df10['ReviewText'], y=df10['count'],text=df10['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 trigrams in review before removing stop words',)
fig.update_traces(marker_color='orange', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [162]:
def top_words_wo_stop(text):
    vector=CountVectorizer(stop_words=indonesian_stopwords,ngram_range=(3,3))
    bag_of_words=vector.fit_transform(data_indonesia['processed_raw_text'])
    sum_of_words= bag_of_words.sum(axis=0)
    frequency=[(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    frequency=sorted(frequency, key=lambda x: x[1], reverse=True)
    return frequency

words=top_words_wo_stop(data_indonesia['processed_raw_text'])
df10= pd.DataFrame(words, columns = ['ReviewText' , 'count'])
df10=df10.head(20)
fig = go.Figure([go.Bar(x=df10['ReviewText'], y=df10['count'],text=df10['count'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 trigrams in review after removing stop words',)
fig.update_traces(marker_color='green', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [163]:
#into POSITIVE (polarity >= 0.2) & (polarity <= 1) 
#For NEUTRAL (polarity >= 0) & (polarity < 0.2)
#For NEGATIVE (polarity >= -1) & (polarity < 0)
data_indonesia.loc[(data_indonesia.polarity >= 0.2) & (data_indonesia.polarity <= 1),'sentiment'] = 'positive'
data_indonesia.loc[(data_indonesia.polarity >= 0) & (data_indonesia.polarity < 0.2),'sentiment'] = 'neutral'
data_indonesia.loc[(data_indonesia.polarity >= -1) & (data_indonesia.polarity < 0), 'sentiment'] = 'negative'

y=data_indonesia.groupby('sentiment')['processed_raw_text'].count()
y=pd.DataFrame(y)
y.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=y['sentiment'], y=y['processed_raw_text'],text=y['processed_raw_text'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Total number of Sentiment in each category')
fig.update_traces(marker_color='dodgerblue', marker_line_color='black',marker_line_width=1.5, opacity=0.8)
fig.update_xaxes(tickangle=330,title='Sentiment')
fig.update_yaxes(title='Count')
fig.show()
In [164]:
#Removing whitespaces from the data
import re

doc_indo=nlp(str(data_indonesia['processed_raw_text']))
sample=[]
for sent in doc_indo.sents:
    sent = re.sub("\s+"," ",sent.text) # clean up the whitespace
    print(sent,"\n")
    sample.append(sent)
    
<>:7: DeprecationWarning:

invalid escape sequence \s

<>:7: DeprecationWarning:

invalid escape sequence \s

<>:7: DeprecationWarning:

invalid escape sequence \s

<ipython-input-164-456d294347eb>:7: DeprecationWarning:

invalid escape sequence \s

0  

menginspirasi 

secara spiritual dan mental buku...  

1  

ini 

adalah salah 

satu yang 

harus saya miliki buku 2  

itu adalah mahakarya spiritualitas 3  

saya akan menjadi yang pertama mengakui kualit...  

4  

ini 

ditulis agak sederhana 

tetapi pesan di 

bal... 5  

ini akan membawa 

anda menuju pencerahan 6  

buku ini 

memberikan 

refleksi 

yang 

dapat 

anda t... 7  

dan 

cara bagi anda untuk 

mencoba dan menilai a 

... 8 saya pertama kali membaca the nabi di 

pergurua...  

9 buku itu memiliki kebangkitan seperti halnya 

s... 10  

itu memiliki efek mendalam pada saya dan menja...  

11  

setelah 

lulus saya bergabung dengan korps perd... 12  

saya membacanya sebelum 

saya menikah 

tepat seb...  

13  

saya selalu kagum 

bahwa ada bab yang menjangka...  

14  

gibran 

menawarkan wawasan dan cinta abadi deng... 15  

saya pikir 

kita sebagai 

bangsa harus membaca d...  

16  

sudah 

pasti saat untuk 

berpikir dan 

merenung b... 17  

klasik abadi 18  

ini 

adalah 

judul yang sangat menuntut dan asum... 19  

jika dia memiliki sarana untuk menerbitkannya ...  

20  

dari 

mulut seorang 

lelaki tua yang akan berlay...  

21  

itu adalah kekacauan 22  

sebuah buku petunjuk 23  

khotbah sufi  

24  

banyak yang dimasukkan 

ke 

dalam 

perspektif tan...  

25  

ada banyak yang mengisyaratkan di tempat kelah...  

26  

mungkin 

karena itu ditulis 

dalam bahasa inggri...  

27  

saya menyukai sampulnya  

28  

membaca ini membuat pikiran saya merasa sepert... 29  

ini 

kebijaksanaan 

langsung dan sederhana memil... ...  

53727  

layar yang terlihat luar biasa di dalam di 

baw...  

53728  

jadi di sana anda punya satu keluhan saya 53729  

saya akan kembali dan memperbarui 

ini jika ada...  

53730 update 408 08saya 

dan 

istri 

saya masih memili...  

53731  

satusatunya masalah kecil 

yang kami miliki dal...  

53732  

telepontelepon ini 

telah dibentuk sebelumnya 

d...  

53733  

saya tinggal di daerah di mana layanan 

cingula...  

53734  

ponsel itu sendiri bukan salah satu yang saya ...  

53735 anak saya melemparkan ponsel lama saya ke temp...  

53736  

ini lebih panjang dan lebih lebar dari ponsel ... 53737  

saya khawatir tentang ketebalan dan daya tahan 53738  

ponsel ini 

juga terbuat dari logam sehingga sa...  

53739  

asesoris lama saya tidak 

pas walaupun merupaka... 53740 0 

oleh 

avanquest untuk 

ponsel saya sebelumnya 53741  

saya menghubungkan 

telepon baru saya dan progr... 53742 0 dan itu mendorong saya untuk menambahkan pro...  

53743  

dengan program ini 

saya dapat membuat nada der...  

53744  

saya juga memasukkan gambar favorit saya tenta...  

53745  

aksesori bluetooth saya berfungsi lebih baik d...  

53746  

di 

telepon itu koneksi antara lubang suara dan...  

53747  

saya tidak 

dapat mendengar suara orang melalui...  

53748  

razr dan jabra saya bekerja dengan baik 53749  

ini berfungsi baik untuk panggilan melalui sua...  

53750  

jadi saya sangat senang dengan ponsel ini teta...  

53751  

saya punya samsung 

x427 sebelum ini dan itu me...  

53752  

panggilan terputus 

penerimaan 

yang 

buruk tidak... 53753  

ponsel ini mengubah pendapat saya sepenuhnya 53754  

itu melakukan semua yang saya butuhkan dan beb...  

53755  

satusatunya downside dan perbedaan antara 4  

5... 53756  

tetapi speakerphone tidak benarbenar apa yang ...  

Name: processed_raw_text, Length: 53682, dtype: object 

In [165]:
text=[]
pos=[]
pos_tag=[]
sentences=[]
for sent in sample:
    parsed_sentence=nlp(str(sample))
    for token in parsed_sentence:
        text.append(token.text)
        pos.append(token.pos_)
        pos_tag.append(token.tag_)
        sentences.append(token)
    
POS2=pd.DataFrame({'sentence':sentences,'text':text,'pos':pos,'pos_tag':pos_tag})
# print('The total number of entities detected were:{}'.format(len(Entities)))
POS2.head()
Out[165]:
sentence text pos pos_tag
0 [ [ PUNCT -LRB-
1 ' ' PUNCT ``
2 0 0 NUM CD
3 ' ' PUNCT ''
4 , , PUNCT ,
In [166]:
b=POS2.groupby(['pos','text'])['pos'].count()
b=POS2[POS2.pos == 'NOUN']
b= b.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
b=pd.DataFrame(b)
b.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=b['text'], y=b['pos'],text=b['pos'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 NOUN in the text',)
fig.update_traces(marker_color='chocolate', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [167]:
b=POS2.groupby(['pos','text'])['pos'].count()
b=POS2[POS2.pos == 'ADJ']
b= b.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
b=pd.DataFrame(b)
b.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=b['text'], y=b['pos'],text=b['pos'],textposition='auto',textangle=0)])
fig.update_layout(title_text='Top 20 ADJ in the text',)
fig.update_traces(marker_color='fuchsia', marker_line_color='black',marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330,title='Words')
fig.update_yaxes(title='Count')
fig.show()
In [168]:
#Creating Tokens
def lemmatization(texts, tags=['NOUN', 'ADJ','ADV']): 
    output = []
    for sent in texts:
        doc = nlp(" ".join(sent))       
        output.append([token.lemma_ for token in doc if token.pos_ in tags])
        return output
In [176]:
#Tokenizing sentences
tokenized_reviews2 = pd.Series(data_indonesia['processed_wo_stopwords_raw_text']).apply(lambda x: x.split())
print(tokenized_reviews2[0])
['menginspirasi', 'spiritual', 'mental', 'buku', 'moral', 'membantu', 'menemukan']
In [170]:
#creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary2 = corpora.Dictionary(tokenized_reviews2)
In [ ]:
doc_term_matrix2 = [dictionary2.doc2bow(rev) for rev in tokenized_reviews2]
# Creating the object for LDA model using gensim library 
LDA = gensim.models.ldamodel.LdaModel 
# Build LDA model 
lda_model2 = LDA(corpus=doc_term_matrix2, id2word=dictionary2,                                     
                num_topics=5, random_state=100, chunksize=1000,                                     
                passes=15)
In [182]:
lda_model2.print_topics()
Out[182]:
[(0,
  '0.028*"daya" + 0.020*"alat" + 0.015*"murah" + 0.012*"1" + 0.012*"krim" + 0.010*"pengisi" + 0.010*"memiliki" + 0.009*"baterai" + 0.009*"layar" + 0.008*"volume"'),
 (1,
  '0.032*"ponsel" + 0.027*"memiliki" + 0.023*"headset" + 0.019*"pisau" + 0.018*"membeli" + 0.015*"perangkat" + 0.013*"berfungsi" + 0.011*"bluetooth" + 0.010*"kabel" + 0.010*"telinga"'),
 (2,
  '0.017*"cepat" + 0.016*"sabun" + 0.013*"es" + 0.010*"ulasan" + 0.010*"mencoba" + 0.010*"kali" + 0.009*"jam" + 0.009*"pas" + 0.009*"menit" + 0.008*"fitur"'),
 (3,
  '0.049*"bagus" + 0.048*"telepon" + 0.020*"menggunakannya" + 0.017*"harga" + 0.015*"salah" + 0.013*"hebat" + 0.013*"orang" + 0.013*"3" + 0.012*"terbaik" + 0.012*"produk"'),
 (4,
  '0.023*"suara" + 0.018*"2" + 0.018*"mudah" + 0.018*"memiliki" + 0.014*"5" + 0.013*"kunci" + 0.012*"buruk" + 0.011*"ukuran" + 0.010*"unit" + 0.010*"sempurna"')]
In [179]:
lda_model2.save('model2.gensim')
C:\Users\dell\Anaconda3\lib\site-packages\smart_open\smart_open_lib.py:398: UserWarning:

This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function

In [183]:
pyLDAvis.enable_notebook() 
lda_display = pyLDAvis.gensim.prepare(lda_model2, doc_term_matrix2, dictionary2, sort_topics=False)
pyLDAvis.display(lda_display)
C:\Users\dell\Anaconda3\lib\site-packages\pyLDAvis\_prepare.py:257: FutureWarning:

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.


Out[183]:
In [ ]: